########################################################################
# DATA LOAD AND CLEAN
########################################################################


#Read the raw data from the project's data folder
final_data <- read_excel(here("data", "thesis_data_final.xlsx"))
#Drop incomplete rows and constant (zero-variance) columns
final_data <- final_data %>%
  drop_na() %>%
  dplyr::select(where(~ n_distinct(.x) > 1))
#Model functions need factors, not character columns
final_data <- final_data %>%
  mutate(across(where(is.character), as.factor))
#Copy with standardized Age and BMI.
#scale() returns a 1-column matrix; as.numeric() keeps the columns plain numeric vectors.
final_data_scaled <- final_data
final_data_scaled$Age <- as.numeric(scale(final_data$Age))
final_data_scaled$BMI <- as.numeric(scale(final_data$BMI))
########################################################################
# MODEL SELECTION FOR STEPWISE TEST
########################################################################

#step.model <-glm(formula = Gender~ ., family = "binomial", 
#   data = data_train)
#summary(step.model)
#step <- stepAIC(step.model, direction="both")
#step

# Stepwise model was determined to be glm(formula = Gender ~ CM_IHD + Lung_Disease + Symptom_Fever + Symptom_Nausea_Vomiting, family = "binomial", data = data_train)

########################################################################
# SPLIT DATA INTO TRAINING AND TEST SETS
########################################################################

#Set a random seed so the split can be replicated exactly later on if needed
set.seed(568)

#Make the split rule: 70% training, stratified on the outcome
splits <- initial_split(final_data, prop = 0.7, strata = Gender)

#Use the split rule to retrieve the training data
data_train <- training(splits)

#Use the split rule to retrieve the test data
data_test  <- testing(splits)

#Make a version of the test data where the outcome is not included.
test_no_outcome <- data_test %>% dplyr::select(-Gender)
test_y <- data_test %>% dplyr::select(Gender)
#test_x is the same object as test_no_outcome; kept for naming symmetry with train_x
test_x <- test_no_outcome
train_y <- data_train %>% dplyr::select(Gender)
train_x <- data_train %>% dplyr::select(-Gender)
#Save via here() so files land in the project root regardless of the working
#directory (consistent with the here() call used when reading the data)
save(data_test, file = here("data_test.RData"))
save(test_x, file = here("test_x.RData"))
########################################################################
# CREATE CROSS-VALIDATION FOLDS FOR MODEL EVALUATION/COMPARISON
########################################################################
#Prepare for 10-fold cross-validation, observations selected into folds with random 
#sampling stratified on outcome
#Reproducible fold assignment
set.seed(877)
#10-fold CV, folds stratified on the outcome so each fold keeps the Gender balance
folds <- vfold_cv(data_train, v = 10, strata = Gender)
#Alternative: leave-one-out CV
#folds <- loo_cv(data_train)
#Save via here() for a working-directory-independent path, consistent with the data load
save(folds, file = here("cv_folds.RData"))
########################################################################
# EVALUATION METRICS
########################################################################
#Which metrics should be computed?
#NOTE(review): recall and sensitivity are the same metric in yardstick, so two of
#the six entries duplicate each other (visible in the results tables). They are
#deliberately kept: later code indexes the collected metrics by row position and
#assumes 6 metrics (12 rows after pivoting to mean/std_err).
my_metrics <- metric_set(roc_auc, recall, specificity, sensitivity, accuracy, bal_accuracy)
########################################################################
# CREATE RECIPE FOR PREPROCESSING DATA
########################################################################
#Recipes for preprocessing: normalize numeric predictors, dummy-code nominal predictors.
#The *_predictors() selectors never touch the outcome, so the -all_outcomes()
#exclusion used with the older all_numeric()/all_nominal() selectors is unnecessary.
train_rec <-
  recipe(Gender ~ ., data = data_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())

#Same preprocessing, restricted to the predictors chosen by the stepwise selection
stepwise_rec <-
  recipe(Gender ~ CM_IHD + Lung_Disease + Symptom_Fever +
    Symptom_Nausea_Vomiting, data = data_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())
########################################
# MODEL 1: Logistic regression
########################################

#Model specification: plain logistic regression via glm
lr_mod <-
  logistic_reg() %>%
  set_engine("glm")

#Workflow: which model to use and how data should be preprocessed
lr_wflow <-
  workflow() %>%
  add_model(lr_mod) %>%
  add_recipe(train_rec)

#Use the workflow and folds object to fit model on cross-validation resamples
lr_fit_rs <-
  lr_wflow %>%
  fit_resamples(folds, metrics = my_metrics, control = control_resamples(save_pred = TRUE))
## ! Fold01: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-deficient fit may be misleading
## ! Fold04: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-deficient fit may be misleading
#Get mean out-of-sample performance measures
lr_metrics <- collect_metrics(lr_fit_rs)
lr_metrics
#Keep metric name, mean and std_err for later model comparison.
#Selected by name rather than by fragile column position (was [ , c(1,3,5)]).
lr_metrics_sub <- lr_metrics %>% dplyr::select(.metric, mean, std_err)
lr_metrics_sub <- lr_metrics_sub %>%
  pivot_longer(!.metric, names_to = "measure", values_to = ".estimate")

#Fit the above logistic regression model on the full training data
lr_fit_train <- 
  lr_wflow %>%
  fit(data = data_train)

#Model summary of the underlying glm object.
#extract_fit_engine() is the supported accessor; $fit$fit$fit depends on internal structure.
summary(extract_fit_engine(lr_fit_train))
## 
## Call:
## stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0956  -0.9002   0.5103   0.8861   2.0058  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                    -0.7389     1.0708  -0.690   0.4902  
## Age                            -0.2554     0.3003  -0.850   0.3951  
## BMI                            -0.3526     0.2890  -1.220   0.2226  
## CM_DM_Yes                       0.1324     0.5734   0.231   0.8174  
## CM_HTM_Yes                     -0.2804     0.5949  -0.471   0.6375  
## CM_IHD_Yes                      1.1068     1.0408   1.063   0.2876  
## Lung_Disease_Yes               -1.8690     1.3828  -1.352   0.1765  
## Symptom_Fever_Yes               0.9089     0.7202   1.262   0.2069  
## Symptom_Cough_Yes              -0.5054     0.6279  -0.805   0.4209  
## Symptom_Sputum_Yes             -0.2885     0.7262  -0.397   0.6911  
## Symptom_Runny_Nose_Yes         -0.9847     1.3959  -0.705   0.4805  
## Symptom_Breathlessness_Yes      0.8674     0.5803   1.495   0.1349  
## Symptom_Abdominal_Pain_Yes     -0.7399     0.9642  -0.767   0.4428  
## Symptom_Nausea_Vomiting_Yes    -1.4820     0.7187  -2.062   0.0392 *
## Symptom_Diarrhoea_Yes           0.4630     0.7718   0.600   0.5486  
## Symptom_Anosmia_Hyposmia_Yes   -0.7717     1.3778  -0.560   0.5754  
## Symptom_Red_Eye_Yes           -17.2961  2399.5449  -0.007   0.9942  
## Symptom_Skin_Rash_Yes         -17.0013  2399.5449  -0.007   0.9943  
## Outcome_Discharged              0.8281     0.7882   1.051   0.2934  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 128.05  on 92  degrees of freedom
## Residual deviance: 102.65  on 74  degrees of freedom
## AIC: 140.65
## 
## Number of Fisher Scoring iterations: 15
#Get the predicted class probabilities computed for the full training data
lr_pred_prob_train <- predict(lr_fit_train, type = "prob", new_data = data_train)
#ROC curve on the full training data. The estimate column is passed positionally,
#as documented for yardstick curve metrics (was the undocumented `.estimate =` name).
lr_train_roc <- roc_curve(tibble(Gender = data_train$Gender, lr_pred_prob_train), truth = Gender, .pred_Female) %>% 
  mutate(model = "Log_reg")
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## ℹ The deprecated feature was likely used in the yardstick package.
##   Please report the issue at <https://github.com/tidymodels/yardstick/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#Predictions on the test set: hard class labels and class probabilities
#Version where the test data still contains the outcome
lr_pred_class_test <- lr_fit_train %>% predict(new_data = data_test, type = "class")
lr_pred_prob_test <- lr_fit_train %>% predict(new_data = data_test, type = "prob")


#Version where the outcome column has been removed
lr_pred_class_test_x <- lr_fit_train %>% predict(new_data = test_no_outcome, type = "class")
lr_pred_prob_test_x <- lr_fit_train %>% predict(new_data = test_no_outcome, type = "prob")
################################################
# MODEL 2: Stepwise logistic regression 
################################################

#Model specification: same glm engine, recipe restricts to stepwise-selected predictors
step_mod <-
  logistic_reg() %>%
  set_engine("glm")

#Workflow: which model to use and how data should be preprocessed
step_wflow <-
  workflow() %>%
  add_model(step_mod) %>%
  add_recipe(stepwise_rec)

#Use the workflow and folds object to fit model on cross-validation resamples
step_fit_rs <- 
  step_wflow %>% 
  fit_resamples(folds, metrics = my_metrics, control = control_resamples(save_pred = TRUE))

#Get mean out-of-sample performance measures
step_metrics <- collect_metrics(step_fit_rs)
step_metrics
#Keep metric name, mean and std_err for later model comparison.
#Selected by name rather than by fragile column position (was [ , c(1,3,5)]).
step_metrics_sub <- step_metrics %>% dplyr::select(.metric, mean, std_err)
step_metrics_sub <- step_metrics_sub %>% 
  pivot_longer(!.metric, names_to = "measure", values_to = ".estimate")

#Fit the stepwise logistic regression model on the full training data
step_fit_train <- 
  step_wflow %>%
  fit(data = data_train)

#Model summary of the underlying glm object (supported accessor instead of $fit$fit$fit)
summary(extract_fit_engine(step_fit_train))
## 
## Call:
## stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9318  -1.0373   0.8933   0.8933   2.0162  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                  -0.3388     0.5767  -0.588  0.55686   
## CM_IHD_Yes                    0.9852     0.8424   1.170  0.24219   
## Lung_Disease_Yes             -1.7662     1.3218  -1.336  0.18148   
## Symptom_Fever_Yes             1.0514     0.6095   1.725  0.08452 . 
## Symptom_Nausea_Vomiting_Yes  -1.5532     0.5547  -2.800  0.00511 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 128.05  on 92  degrees of freedom
## Residual deviance: 113.89  on 88  degrees of freedom
## AIC: 123.89
## 
## Number of Fisher Scoring iterations: 4
#Get the predicted class probabilities computed for the full training data
step_pred_prob_train <- predict(step_fit_train, type = "prob", new_data = data_train)
#ROC curve on the full training data; estimate column passed positionally as yardstick documents
stepwise_train_roc <- roc_curve(tibble(Gender = data_train$Gender, step_pred_prob_train), truth = Gender, .pred_Female) %>% 
  mutate(model = "Step_reg")

#Predictions on the test set: hard class labels and class probabilities
#Version where the test data still contains the outcome
step_pred_class_test <- step_fit_train %>% predict(new_data = data_test, type = "class")
step_pred_prob_test <- step_fit_train %>% predict(new_data = data_test, type = "prob")


#Version where the outcome column has been removed
step_pred_class_test_x <- step_fit_train %>% predict(new_data = test_no_outcome, type = "class")
step_pred_prob_test_x <- step_fit_train %>% predict(new_data = test_no_outcome, type = "prob")
################################################
# MODEL 3: Penalized logistic regression (Ridge)
################################################

#Model specification: ridge = mixture 0, penalty tuned
#(the unused lambda_grid object has been removed; the grid is built from ridge_param below)
ridge_mod <- 
  logistic_reg(mixture = 0, penalty = tune()) %>% 
  set_engine("glmnet") %>%
  set_mode("classification") 
#Set up workflow
ridge_wflow <-
  workflow() %>%
  add_model(ridge_mod) %>%
  add_recipe(train_rec)

#Parameter object for this data/model: possible values, ranges, types etc.
#extract_parameter_set_dials() replaces the deprecated parameters.workflow().
ridge_param <-
  ridge_wflow %>%
  hardhat::extract_parameter_set_dials() %>% 
  finalize(data_train)

#Look at the range for the penalty parameter
#(extract_parameter_dials() replaces the deprecated pull_dials_object())
ridge_param %>% hardhat::extract_parameter_dials("penalty")
## Amount of Regularization (quantitative)
## Transformer: log-10 [1e-100, Inf]
## Range (transformed scale): [-10, 0]
#Tune the model: evaluate a regular grid of 200 penalty values on the CV folds
set.seed(99)
ridge_tune <-
  ridge_wflow %>%
  tune_grid(
    folds,
    grid = ridge_param %>% grid_regular(levels = c(penalty = 200)),
    metrics = my_metrics
  )


#View plot of penalty values vs. AUROC
autoplot(ridge_tune) + theme(legend.position = "top")

#View the penalty values with largest AUROC.
#Metric is named explicitly, which silences the "No value of `metric` was given" warning.
show_best(ridge_tune, metric = "roc_auc") %>% dplyr::select(-.estimator)
#Store the best penalty value
ridge_best_param <- select_best(ridge_tune, metric = "roc_auc")

#Set up the final workflow using the best penalty value
final_ridge_wflow <- 
  ridge_wflow %>% 
  finalize_workflow(ridge_best_param)

#View the workflow specifications
final_ridge_wflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_normalize()
## • step_dummy()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = 1
##   mixture = 0
## 
## Computational engine: glmnet
#Fit the final model on the cross-validation folds set up for model evaluation/comparison
ridge_fit_rs <- 
  final_ridge_wflow %>% 
  fit_resamples(folds, metrics = my_metrics, control = control_resamples(save_pred = TRUE))

#Get mean out-of-sample performance measures
ridge_metrics <- collect_metrics(ridge_fit_rs)
ridge_metrics
#Keep metric name, mean and std_err; selected by name rather than fragile column
#positions, and pivoted to ".estimate" for consistency with the logistic/stepwise blocks
ridge_metrics_sub <- ridge_metrics %>% dplyr::select(.metric, mean, std_err)
ridge_metrics_sub <- ridge_metrics_sub %>% 
  pivot_longer(!.metric, names_to = "measure", values_to = ".estimate")


#Fit the final model on the full training data
ridge_fit_train <- 
  final_ridge_wflow %>%
  fit(data = data_train)


#Look at variable importance.
#extract_fit_parsnip() replaces the deprecated pull_workflow_fit().
ridge_fit_train %>% 
  extract_fit_parsnip() %>% 
  vip(lambda = ridge_best_param$penalty, num_features = 200)

#Get the model coefficients
ridge_coeff <- data.frame(ridge_fit_train %>%
                            extract_fit_parsnip() %>%
                            tidy())

#Number of non-zero coefficients (ridge shrinks but does not zero out coefficients)
sum(ridge_coeff$estimate != 0)
## [1] 19
#Number of zero coefficients
sum(ridge_coeff$estimate == 0)
## [1] 0
#Get the predicted class probabilities computed for the full training data
ridge_pred_prob_train <- predict(ridge_fit_train, type = "prob", new_data = data_train)
#ROC curve on the full training data; estimate column passed positionally as yardstick documents
ridge_train_roc <- roc_curve(tibble(Gender = data_train$Gender, ridge_pred_prob_train), truth = Gender, .pred_Female) %>% 
  mutate(model = "Ridge_reg")

#Predictions on the test set: hard class labels and class probabilities
#Version where the test data still contains the outcome
ridge_pred_class_test <- ridge_fit_train %>% predict(new_data = data_test, type = "class")
ridge_pred_prob_test <- ridge_fit_train %>% predict(new_data = data_test, type = "prob")
#Version where the outcome column has been removed
ridge_pred_class_test_no_outcome <- ridge_fit_train %>% predict(new_data = test_no_outcome, type = "class")
ridge_pred_prob_test_no_outcome <- ridge_fit_train %>% predict(new_data = test_no_outcome, type = "prob")
################################################
# MODEL 4: Penalized logistic regression (LASSO)
################################################

#Model specification: lasso = mixture 1, penalty tuned
#(the unused lambda_grid object has been removed; the grid is built from lasso_param below)
lasso_mod <- 
  logistic_reg(mixture = 1, penalty = tune()) %>% 
  set_engine("glmnet") %>%
  set_mode("classification") 
#Set up workflow
lasso_wflow <-
  workflow() %>%
  add_model(lasso_mod) %>%
  add_recipe(train_rec)

#Parameter object for this data/model.
#extract_parameter_set_dials() replaces the deprecated parameters.workflow().
lasso_param <-
  lasso_wflow %>%
  hardhat::extract_parameter_set_dials() %>% 
  finalize(data_train)

#Look at the range for the penalty parameter
lasso_param %>% hardhat::extract_parameter_dials("penalty")
## Amount of Regularization (quantitative)
## Transformer: log-10 [1e-100, Inf]
## Range (transformed scale): [-10, 0]
#Tune the model: evaluate a regular grid of 200 penalty values on the CV folds
set.seed(99)
lasso_tune <-
  lasso_wflow %>%
  tune_grid(
    folds,
    grid = lasso_param %>% grid_regular(levels = c(penalty = 200)),
    metrics = my_metrics
  )


#View plot of penalty values vs. AUROC
autoplot(lasso_tune) + theme(legend.position = "top")

#View the penalty values with largest AUROC.
#Metric is named explicitly, which silences the "No value of `metric` was given" warning.
show_best(lasso_tune, metric = "roc_auc") %>% dplyr::select(-.estimator)
#Store the best penalty value
lasso_best_param <- select_best(lasso_tune, metric = "roc_auc")

#Set up the final workflow using the best penalty value
final_lasso_wflow <- 
  lasso_wflow %>% 
  finalize_workflow(lasso_best_param)

#View the workflow specifications
final_lasso_wflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_normalize()
## • step_dummy()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = 0.124588336429501
##   mixture = 1
## 
## Computational engine: glmnet
#Fit the final model on the cross-validation folds set up for model evaluation/comparison
lasso_fit_rs <- 
  final_lasso_wflow %>% 
  fit_resamples(folds, metrics = my_metrics, control = control_resamples(save_pred = TRUE))

#Get mean out-of-sample performance measures
lasso_metrics <- collect_metrics(lasso_fit_rs)
lasso_metrics
#Keep metric name, mean and std_err; selected by name rather than fragile column
#positions, and pivoted to ".estimate" for consistency with the logistic/stepwise blocks
lasso_metrics_sub <- lasso_metrics %>% dplyr::select(.metric, mean, std_err)
lasso_metrics_sub <- lasso_metrics_sub %>% 
  pivot_longer(!.metric, names_to = "measure", values_to = ".estimate")


#Fit the final model on the full training data
lasso_fit_train <- 
  final_lasso_wflow %>%
  fit(data = data_train)


#Look at variable importance.
#extract_fit_parsnip() replaces the deprecated pull_workflow_fit().
lasso_fit_train %>% 
  extract_fit_parsnip() %>% 
  vip(lambda = lasso_best_param$penalty, num_features = 200)

#Get the model coefficients
lasso_coeff <- data.frame(lasso_fit_train %>%
                            extract_fit_parsnip() %>%
                            tidy())

#Number of non-zero coefficients (lasso zeroes out most coefficients at the selected penalty)
sum(lasso_coeff$estimate != 0)
## [1] 2
#Number of zero coefficients
sum(lasso_coeff$estimate == 0)
## [1] 17
#Get the predicted class probabilities computed for the full training data
lasso_pred_prob_train <- predict(lasso_fit_train, type = "prob", new_data = data_train)
#ROC curve on the full training data; estimate column passed positionally as yardstick documents
lasso_train_roc <- roc_curve(tibble(Gender = data_train$Gender, lasso_pred_prob_train), truth = Gender, .pred_Female) %>% 
  mutate(model = "Lasso_reg")

#Predictions on the test set: hard class labels and class probabilities
#Version where the test data still contains the outcome
lasso_pred_class_test <- lasso_fit_train %>% predict(new_data = data_test, type = "class")
lasso_pred_prob_test <- lasso_fit_train %>% predict(new_data = data_test, type = "prob")
#Version where the outcome column has been removed
lasso_pred_class_test_no_outcome <- lasso_fit_train %>% predict(new_data = test_no_outcome, type = "class")
lasso_pred_prob_test_no_outcome <- lasso_fit_train %>% predict(new_data = test_no_outcome, type = "prob")
################################################
# MODEL 5: Logistic elastic net regression (ELNET)
################################################

#Model specification: elastic net, tuning both penalty and mixture
#(the unused lambda_grid object has been removed; the grid is built from elnet_param below)
elnet_mod <- 
  logistic_reg(mixture = tune(), penalty = tune()) %>% 
  set_engine("glmnet") %>%
  set_mode("classification") 
#Set up workflow
elnet_wflow <-
  workflow() %>%
  add_model(elnet_mod) %>%
  add_recipe(train_rec)

#Parameter object for this data/model.
#extract_parameter_set_dials() replaces the deprecated parameters.workflow().
elnet_param <-
  elnet_wflow %>%
  hardhat::extract_parameter_set_dials() %>% 
  finalize(data_train)

#Look at the range for the penalty parameter
elnet_param %>% hardhat::extract_parameter_dials("penalty")
## Amount of Regularization (quantitative)
## Transformer: log-10 [1e-100, Inf]
## Range (transformed scale): [-10, 0]
#Tune the model: regular grid of 100 penalty values x 10 mixture values, on the CV folds
set.seed(99)
elnet_tune <-
  elnet_wflow %>%
  tune_grid(
    folds,
    grid = elnet_param %>% grid_regular(levels = c(penalty = 100, mixture = 10)),
    metrics = my_metrics
  )


#View plot of penalty values vs. AUROC
autoplot(elnet_tune) + theme(legend.position = "top")

#View the penalty/mixture combinations with largest AUROC.
#Metric is named explicitly, which silences the "No value of `metric` was given" warning.
show_best(elnet_tune, metric = "roc_auc") %>% dplyr::select(-.estimator)
#Store the best penalty/mixture combination
elnet_best_param <- select_best(elnet_tune, metric = "roc_auc")

#Set up the final workflow using the best parameter values
final_elnet_wflow <- 
  elnet_wflow %>% 
  finalize_workflow(elnet_best_param)

#View the workflow specifications
final_elnet_wflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_normalize()
## • step_dummy()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = 0.792482898353919
##   mixture = 0.05
## 
## Computational engine: glmnet
#Fit the final model on the cross-validation folds set up for model evaluation/comparison
elnet_fit_rs <- 
  final_elnet_wflow %>% 
  fit_resamples(folds, metrics = my_metrics, control = control_resamples(save_pred = TRUE))

#Get mean out-of-sample performance measures
elnet_metrics <- collect_metrics(elnet_fit_rs)
elnet_metrics
#Keep metric name, mean and std_err; selected by name rather than fragile column
#positions, and pivoted to ".estimate" for consistency with the logistic/stepwise blocks
elnet_metrics_sub <- elnet_metrics %>% dplyr::select(.metric, mean, std_err)
elnet_metrics_sub <- elnet_metrics_sub %>% 
  pivot_longer(!.metric, names_to = "measure", values_to = ".estimate")


#Fit the final model on the full training data
elnet_fit_train <- 
  final_elnet_wflow %>%
  fit(data = data_train)


#Look at variable importance.
#extract_fit_parsnip() replaces the deprecated pull_workflow_fit().
elnet_fit_train %>% 
  extract_fit_parsnip() %>% 
  vip(lambda = elnet_best_param$penalty, num_features = 200)

#Get the model coefficients
elnet_coeff <- data.frame(elnet_fit_train %>%
                            extract_fit_parsnip() %>%
                            tidy())

#Number of non-zero coefficients
sum(elnet_coeff$estimate != 0)
## [1] 12
#Number of zero coefficients
sum(elnet_coeff$estimate == 0)
## [1] 7
#Get the predicted class probabilities computed for the full training data
elnet_pred_prob_train <- predict(elnet_fit_train, type = "prob", new_data = data_train)
#ROC curve on the full training data; estimate column passed positionally as yardstick documents
elnet_train_roc <- roc_curve(tibble(Gender = data_train$Gender, elnet_pred_prob_train), truth = Gender, .pred_Female) %>% 
  mutate(model = "Elnet")

#Predictions on the test set: hard class labels and class probabilities
#Version where the test data still contains the outcome
elnet_pred_class_test <- elnet_fit_train %>% predict(new_data = data_test, type = "class")
elnet_pred_prob_test <- elnet_fit_train %>% predict(new_data = data_test, type = "prob")
#Version where the outcome column has been removed
elnet_pred_class_test_no_outcome <- elnet_fit_train %>% predict(new_data = test_no_outcome, type = "class")
elnet_pred_prob_test_no_outcome <- elnet_fit_train %>% predict(new_data = test_no_outcome, type = "prob")
library(inspectdf)
library(skimr)
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
library(MuMIn)
library(plotmo)
## Loading required package: Formula
## Loading required package: plotrix
## 
## Attaching package: 'plotrix'
## The following object is masked from 'package:scales':
## 
##     rescale
## Loading required package: TeachingDemos
library(separationplot)
## Loading required package: RColorBrewer
## Loading required package: Hmisc
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:TeachingDemos':
## 
##     cnvrt.coords, subplot
## The following objects are masked from 'package:table1':
## 
##     label, label<-, units
## The following object is masked from 'package:parsnip':
## 
##     translate
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: foreign
library(ggfortify)
## Registered S3 method overwritten by 'ggfortify':
##   method          from   
##   autoplot.glmnet parsnip
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:dplyr':
## 
##     recode
library(cvms)
library(broom)
library(yardstick)
library(glmnet)
library(glmtoolbox)
## 
## Attaching package: 'glmtoolbox'
## The following object is masked from 'package:MuMIn':
## 
##     QIC
######################################################
# SUMMARIZE RESULTS BASED ON TRAINING DATA EVALUATIONS
######################################################

#Combine the CV results from the different models into one table.
#Each *_metrics_sub tibble has 12 rows (6 metrics x mean/std_err). Rows 11:12
#(specificity) are moved to the top to match the presentation order.
#Fixes two bugs in the original bind_cols() call:
#  - rows 13:14 were indexed but do not exist in a 12-row tibble, producing the
#    all-NA rows visible in the rendered table;
#  - all three columns of step_metrics_sub were bound (its .metric/measure columns
#    leaked into the table), yielding 9 columns for the 7 names assigned below.
row_order <- c(11:12, 1:10)
metrics_table_data_train <- bind_cols(
  lr_metrics_sub[row_order, 1:3],
  step_metrics_sub[row_order, 3],
  ridge_metrics_sub[row_order, 3],
  lasso_metrics_sub[row_order, 3],
  elnet_metrics_sub[row_order, 3]
)
colnames(metrics_table_data_train) <- c("Metric", "Measure", "Log_reg", "Step_reg", "Ridge_reg", "Lasso_reg", "Elnet")

#Convert the tibble to a data.frame
results_table_train <- data.frame(metrics_table_data_train)

#Produce a table with results based on training data
results_table_train %>%
  kbl(caption = "Table 3. Model performance based on 10-fold CV on training data (n = 94)",  digits = 3) %>%
  kable_classic(full_width = F, html_font = "Cambria") %>%
  collapse_rows(columns = 1, valign = "top")
## Table 3. Model performance based on 10-fold CV on training data (n = 94)
## Metric Measure Log_reg Step_reg Ridge_reg Lasso_reg Elnet NA. NA..1
## specificity mean 0.720 specificity mean 0.840 0.840 0.860 0.860
## specificity std_err 0.053 specificity std_err 0.058 0.040 0.060 0.043
## accuracy mean 0.553 accuracy mean 0.714 0.544 0.516 0.545
## accuracy std_err 0.059 accuracy std_err 0.049 0.040 0.023 0.041
## bal_accuracy mean 0.532 bal_accuracy mean 0.700 0.510 0.480 0.510
## bal_accuracy std_err 0.062 bal_accuracy std_err 0.049 0.041 0.022 0.042
## recall mean 0.345 recall mean 0.560 0.180 0.100 0.160
## recall std_err 0.094 recall std_err 0.055 0.052 0.055 0.046
## roc_auc mean 0.509 roc_auc mean 0.658 0.552 0.598 0.601
## roc_auc std_err 0.076 roc_auc std_err 0.052 0.072 0.049 0.062
## sensitivity mean 0.345 sensitivity mean 0.560 0.180 0.100 0.160
## sensitivity std_err 0.094 sensitivity std_err 0.055 0.052 0.055 0.046
## NA NA NA NA NA NA NA NA NA
## NA NA NA NA NA NA NA NA NA
#Plot the ROC curves of the final models fit on the full training data
train_rocs <- bind_rows(lr_train_roc, stepwise_train_roc, ridge_train_roc, lasso_train_roc, elnet_train_roc)
ggplot(train_rocs, aes(x = 1 - specificity, y = sensitivity, col = model)) +
  geom_path(lwd = 1.5, alpha = 0.8) +
  geom_abline(lty = 3) +
  coord_equal() +
  scale_color_viridis_d(option = "plasma", end = .6) +
  ggtitle("ROC Based on Training Data")

######################################################
# SUMMARIZE RESULTS BASED ON TEST DATA EVALUATION
######################################################

#Collect truth, class probabilities and predicted class per model for the test set
lr_results_test <- bind_cols(tibble(Gender = data_test$Gender), lr_pred_prob_test, lr_pred_class_test)
step_results_test <- bind_cols(tibble(Gender = data_test$Gender), step_pred_prob_test, step_pred_class_test)
ridge_results_test <- bind_cols(tibble(Gender = data_test$Gender), ridge_pred_prob_test, ridge_pred_class_test)
lasso_results_test <- bind_cols(tibble(Gender = data_test$Gender), lasso_pred_prob_test, lasso_pred_class_test)
elnet_results_test <- bind_cols(tibble(Gender = data_test$Gender), elnet_pred_prob_test, elnet_pred_class_test)

#All five models are scored with the same panel of test-set metrics, so compute
#them with one helper instead of five copy-pasted blocks.
#`results` must contain Gender (truth), .pred_Female (class probability) and
#.pred_class (predicted class). Returns a tibble with .metric and .estimate,
#one row per metric, in the same order as the original copy-pasted code.
compute_test_metrics <- function(results) {
  bind_rows(
    roc_auc(results, Gender, .pred_Female),
    accuracy(results, truth = Gender, estimate = .pred_class),
    bal_accuracy(results, truth = Gender, estimate = .pred_class),
    f_meas(results, truth = Gender, estimate = .pred_class),
    precision(results, truth = Gender, estimate = .pred_class),
    recall(results, truth = Gender, estimate = .pred_class),
    specificity(results, truth = Gender, estimate = .pred_class),
    sensitivity(results, truth = Gender, estimate = .pred_class)
  ) %>%
    dplyr::select(.metric, .estimate)
}

lr_metrics_sub_test    <- compute_test_metrics(lr_results_test)
step_metrics_sub_test  <- compute_test_metrics(step_results_test)
ridge_metrics_sub_test <- compute_test_metrics(ridge_results_test)
lasso_metrics_sub_test <- compute_test_metrics(lasso_results_test)
elnet_metrics_sub_test <- compute_test_metrics(elnet_results_test)


# Combine the test-set results from the different models into one tibble.
# Naming the columns directly at bind time avoids bind_cols()'s automatic
# `.estimate -> .estimate...n` name-repair messages, and makes the later
# colnames() assignment unnecessary.
metrics_table_data_test <- bind_cols(
  Metric   = lr_metrics_sub_test[[1]],
  Logistic = lr_metrics_sub_test[[2]],
  Stepwise = step_metrics_sub_test[[2]],
  Ridge    = ridge_metrics_sub_test[[2]],
  Lasso    = lasso_metrics_sub_test[[2]],
  Elnet    = elnet_metrics_sub_test[[2]]
)

# Convert the tibble to a data.frame (kept for downstream code that
# indexes it with data.frame semantics).
results_table_test <- data.frame(metrics_table_data_test)

# Produce a table with results based on test data.
# `FALSE` spelled out instead of `F` (reassignable shorthand).
results_table_test %>%
  kbl(caption = "Table 4. Model performance evaluated on test data (n = 40)",  digits = 3) %>%
  kable_classic(full_width = FALSE, html_font = "Cambria") %>%
  collapse_rows(columns = 1, valign = "top")
## Table 4. Model performance evaluated on test data (n = 40)
## Metric Logistic Stepwise Ridge Lasso Elnet
## roc_auc 0.751 0.711 0.761 0.518 0.688
## accuracy 0.634 0.683 0.610 0.561 0.561
## bal_accuracy 0.620 0.663 0.580 0.518 0.518
## f_meas 0.545 0.581 0.429 0.250 0.250
## precision 0.600 0.692 0.600 0.500 0.500
## recall 0.500 0.500 0.333 0.167 0.167
## specificity 0.739 0.826 0.826 0.870 0.870
## sensitivity 0.500 0.500 0.333 0.167 0.167
# Plot ROCs using the final models fit on full training data to predict
# on test data. A helper builds one ROC-curve tibble per model, replacing
# five near-identical roc_curve() calls.
roc_for_model <- function(pred_prob, label) {
  roc_curve(tibble(Gender = data_test$Gender, pred_prob),
            truth = Gender, estimate = .pred_Female) %>%
    mutate(model = label)
}

lr_test_roc       <- roc_for_model(lr_pred_prob_test, "Log_reg")
stepwise_test_roc <- roc_for_model(step_pred_prob_test, "Stepwise_reg")
ridge_test_roc    <- roc_for_model(ridge_pred_prob_test, "Ridge_reg")
lasso_test_roc    <- roc_for_model(lasso_pred_prob_test, "Lasso_reg")
elnet_test_roc    <- roc_for_model(elnet_pred_prob_test, "Elnet_reg")

bind_rows(lr_test_roc, stepwise_test_roc, ridge_test_roc,
          lasso_test_roc, elnet_test_roc) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity, col = model)) +
  # position_dodge() is meaningless for continuous, overlapping ROC paths
  # and only produced a "requires non-overlapping x intervals" warning;
  # plain geom_path() is correct. `linewidth` replaces the base-graphics
  # `lwd` alias.
  geom_path(linewidth = 1.5, alpha = 0.8) +
  geom_abline(lty = 3) +
  coord_equal() +
  scale_color_viridis_d(option = "turbo", end = .6) +
  ggtitle("ROC Based on Test Data")

# Yes, a diagonal ROC segment is "legal": if the jump from one threshold to
# the next increases the number of false positives and false negatives
# together, the result is a diagonal line. Two reasons this can happen:
#
# 1. Two observations share the same threshold value but have different
#    ground-truth labels.
# 2. The resolution between two consecutive thresholds is large enough — in
#    that case you may also check a threshold between the two.

# Cross-validated metrics for the lasso tuning grid.
lasso_tune  %>%
  collect_metrics()
######################################################
# LASSO PLOTS
######################################################
# CV performance across the penalty grid, one panel per metric.
lasso_tune %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  # `size` for lines is deprecated since ggplot2 3.4.0; use `linewidth`.
  geom_line(linewidth = 1.5) +
  facet_wrap(~.metric, scales = "free", nrow = 2) +
  scale_x_log10() +
  theme(legend.position = "none")

# Penalty with the best cross-validated AUC. The deprecated `maximize`
# argument (ignored by current tune, per the emitted warning) is dropped.
best_auc <- lasso_tune %>%
  select_best(metric = "roc_auc")

# Variable importance at the selected penalty.
# pull_workflow_fit() is superseded by extract_fit_parsnip().
lasso_fit_train %>%
  fit(data_train) %>%
  extract_fit_parsnip() %>%
  vi(lambda = best_auc$penalty) %>%
  mutate(
    Importance = abs(Importance),
    Variable = fct_reorder(Variable, Importance)
  ) %>%
  ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
  geom_col() +
  scale_x_continuous(expand = c(0, 0)) +
  labs(y = NULL) +
  ggtitle("Lasso Model - Explanatory Variable Importance")

######################################################
# RIDGE PLOTS
######################################################
# CV performance across the penalty grid, one panel per metric.
ridge_tune %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  # `size` for lines is deprecated since ggplot2 3.4.0; use `linewidth`.
  geom_line(linewidth = 1.5) +
  facet_wrap(~.metric, scales = "free", nrow = 2) +
  scale_x_log10() +
  theme(legend.position = "none")

# Penalty with the best cross-validated AUC. The deprecated `maximize`
# argument (ignored by current tune, per the emitted warning) is dropped.
best_auc <- ridge_tune %>%
  select_best(metric = "roc_auc")

# Variable importance at the selected penalty.
# pull_workflow_fit() is superseded by extract_fit_parsnip().
ridge_fit_train %>%
  fit(data_train) %>%
  extract_fit_parsnip() %>%
  vi(lambda = best_auc$penalty) %>%
  mutate(
    Importance = abs(Importance),
    Variable = fct_reorder(Variable, Importance)
  ) %>%
  ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
  geom_col() +
  scale_x_continuous(expand = c(0, 0)) +
  labs(y = NULL) +
  ggtitle("Ridge Regression Model - Explanatory Variable Importance")

######################################################
# ELNET PLOTS
######################################################
# CV performance across the penalty grid, one panel per metric.
elnet_tune %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  # `size` for lines is deprecated since ggplot2 3.4.0; use `linewidth`.
  geom_line(linewidth = .5) +
  facet_wrap(~.metric, scales = "free", nrow = 6) +
  scale_x_log10() +
  theme(legend.position = "none")

# Penalty with the best cross-validated AUC. The deprecated `maximize`
# argument (ignored by current tune, per the emitted warning) is dropped.
best_auc <- elnet_tune %>%
  select_best(metric = "roc_auc")

# Variable importance at the selected penalty (top 20 features).
# pull_workflow_fit() is superseded by extract_fit_parsnip().
elnet_fit_train %>%
  fit(data_train) %>%
  extract_fit_parsnip() %>%
  vi(lambda = best_auc$penalty, num_features = 20) %>%
  mutate(
    Importance = abs(Importance),
    Variable = fct_reorder(Variable, Importance)
  ) %>%
  ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
  geom_col() +
  scale_x_continuous(expand = c(0, 0)) +
  labs(y = NULL) +
  ggtitle("Elastic Net Model - Explanatory Variable Importance")

######################################################
# STEPWISE PLOTS
######################################################
# Variable importance from the underlying glm of the stepwise workflow.
vi((step_fit_train$fit$fit$fit)) %>%
    mutate(
    Importance = abs(Importance),
    Variable = fct_reorder(Variable, Importance)
  ) %>%
  ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
  geom_col() +
  scale_x_continuous(expand = c(0, 0)) +
  labs(y = NULL) +
  # Fixed typo in the title: "Logisitc" -> "Logistic".
  ggtitle("Stepwise Logistic Regression - Explanatory Variable Importance")

######################################################
# LR PLOTS
######################################################
# Variable importance from the underlying glm of the plain logistic
# regression workflow, ordered by absolute importance.
vi(lr_fit_train$fit$fit$fit) %>%
  mutate(Importance = abs(Importance),
         Variable = fct_reorder(Variable, Importance)) %>%
  ggplot(aes(Importance, Variable, fill = Sign)) +
  geom_col() +
  scale_x_continuous(expand = c(0, 0)) +
  labs(y = NULL) +
  ggtitle("Logistic Regression - Explanatory Variable Importance")

# Print the model specifications stored inside the three fitted glmnet
# workflows, showing the selected penalty (lambda) and mixture (alpha).
lasso_fit_train$fit$fit$spec
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = 0.124588336429501
##   mixture = 1
## 
## Computational engine: glmnet 
## 
## Model fit template:
## glmnet::glmnet(x = missing_arg(), y = missing_arg(), weights = missing_arg(), 
##     alpha = 1, family = "binomial")
ridge_fit_train$fit$fit$spec
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = 1
##   mixture = 0
## 
## Computational engine: glmnet 
## 
## Model fit template:
## glmnet::glmnet(x = missing_arg(), y = missing_arg(), weights = missing_arg(), 
##     alpha = 0, family = "binomial")
elnet_fit_train$fit$fit$spec
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = 0.792482898353919
##   mixture = 0.05
## 
## Computational engine: glmnet 
## 
## Model fit template:
## glmnet::glmnet(x = missing_arg(), y = missing_arg(), weights = missing_arg(), 
##     alpha = 0.05, family = "binomial")
######################################################
# GOODNESS OF FIT
######################################################
# Design matrices (dummy-coded, intercept column dropped) and outcome
# vectors. Note: this overwrites the tibble versions of train_x/test_x/
# train_y/test_y created near the top of the script.
train_x <- model.matrix(Gender ~ ., data_train)[, -1]
train_y <- data_train$Gender

test_x <- model.matrix(Gender ~ ., data_test)[, -1]
test_y <- data_test$Gender

# NOTE(review): these models are re-fit on the TEST data with fixed
# lambdas — confirm whether the goodness-of-fit statistics were meant to
# come from the training fits instead.
# NOTE(review): elnet_mod is fit with alpha = 1, which is a lasso, not an
# elastic net — the tuned elnet spec above used mixture (alpha) = 0.05.
# Verify the intended alpha/lambda pair.
lasso_mod <- (glmnet(test_x, test_y, family = "binomial", alpha = 1, lambda =0.024658110758226))
ridge_mod <- (glmnet(test_x, test_y, family = "binomial", alpha = 0, lambda =0.0195639834351706))
elnet_mod <- (glmnet(test_x, test_y, family = "binomial", alpha = 1, lambda =0.0242012826479438))

# Residual deviances: the glm objects expose $deviance directly; for the
# glmnet fits deviance() is used.
lr_deviance <- lr_fit_train$fit$fit$fit$deviance
#lr_fit_train$fit$fit$fit$aic%>% tidy()
step_deviance <-step_fit_train$fit$fit$fit$deviance
#step_fit_train$fit$fit$fit$aic%>% tidy()
lasso_deviance <- deviance(lasso_mod)
ridge_deviance <- deviance(ridge_mod)
elnet_deviance <- deviance(elnet_mod)

# One-row matrix of all deviances (not referenced in the visible code
# below, which builds `models` instead).
deviances <- cbind("Deviance", lr_deviance, step_deviance, ridge_deviance, lasso_deviance, elnet_deviance)


#p-value = 1 - pchisq(deviance, degrees of freedom)
# Full summary of the glm underlying the stepwise workflow (coefficients,
# deviances, AIC — see printed output below).
summary(step_fit_train$fit$fit$fit)
## 
## Call:
## stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9318  -1.0373   0.8933   0.8933   2.0162  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                  -0.3388     0.5767  -0.588  0.55686   
## CM_IHD_Yes                    0.9852     0.8424   1.170  0.24219   
## Lung_Disease_Yes             -1.7662     1.3218  -1.336  0.18148   
## Symptom_Fever_Yes             1.0514     0.6095   1.725  0.08452 . 
## Symptom_Nausea_Vomiting_Yes  -1.5532     0.5547  -2.800  0.00511 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 128.05  on 92  degrees of freedom
## Residual deviance: 113.89  on 88  degrees of freedom
## AIC: 123.89
## 
## Number of Fisher Scoring iterations: 4
# Residual degrees of freedom of the two glm fits.
lr_fit_train$fit$fit$fit$df.residual
## [1] 74
step_fit_train$fit$fit$fit$df.residual
## [1] 88
# For glmnet fits, $df is the number of non-zero coefficients at the
# chosen lambda. NOTE(review): using it as the chi-square degrees of
# freedom below is questionable — confirm this is the intended df.
lasso_mod$df
## [1] 12
ridge_mod$df
## [1] 18
elnet_mod$df
## [1] 12
# Deviance goodness-of-fit p-values: 1 - pchisq(deviance, df).
p_lr_dev <- 1 - pchisq(lr_deviance, lr_fit_train$fit$fit$fit$df.residual)
p_lr_dev
## [1] 0.01543401
p_step_dev <- 1 - pchisq(step_deviance, step_fit_train$fit$fit$fit$df.residual)
p_step_dev
## [1] 0.03313262
p_ridge_dev <- 1- pchisq(ridge_deviance, ridge_mod$df)
p_ridge_dev
## [1] 0.2729321
p_lasso_dev <- 1- pchisq(lasso_deviance, lasso_mod$df)
p_lasso_dev
## [1] 0.009091613
p_elnet_dev <- 1- pchisq(elnet_deviance, elnet_mod$df)
p_elnet_dev
## [1] 0.009673136
lasso_coeff
# Approximate information criteria for the penalized models, computed
# from the glmnet deviance (tLL = -deviance as a log-likelihood proxy)
# and the number of non-zero coefficients ($df). The identical
# computation was previously repeated three times; it is factored into a
# helper that returns one row of the comparison table.
glmnet_ic <- function(mod, label) {
  tLL <- -deviance(mod)   # log-likelihood proxy used by the original code
  k <- mod$df             # non-zero coefficients at the fitted lambda
  n <- mod$nobs
  AICc <- -tLL + 2 * k + 2 * k * (k + 1) / (n - k - 1)
  AIC_ <- -tLL + 2 * k
  BIC <- log(n) * k - tLL
  c(label, k, AIC_, BIC, AICc, deviance(mod))
}

# Header row plus one row per penalized model (same h/l/r/e variables the
# downstream table-building code expects).
h <- c("Model", "DF", "AIC", "BIC", "AICc", "Deviance")
l <- glmnet_ic(lasso_mod, "Lasso")
r <- glmnet_ic(ridge_mod, "Ridge")
e <- glmnet_ic(elnet_mod, "Elnet")
rbind(h,r,l, e)
##   [,1]    [,2] [,3]               [,4]               [,5]              
## h "Model" "DF" "AIC"              "BIC"              "AICc"            
## r "Ridge" "18" "57.1294591736847" "87.9737563743623" "88.2203682645938"
## l "Lasso" "12" "50.5074134032682" "71.0702782037199" "61.6502705461253"
## e "Elnet" "12" "50.3184903173487" "70.8813551178004" "61.4613474602058"
##   [,6]              
## h "Deviance"        
## r "21.1294591736847"
## l "26.5074134032682"
## e "26.3184903173487"
#LR
# Fit statistics for the unpenalized logistic regression (AIC from the
# glm object itself; AICc/BIC from their generic functions).
AIC_<-lr_fit_train$fit$fit$fit$aic
k <-lr_fit_train$fit$fit$fit$df.residual
AICc <-AICc(lr_fit_train$fit$fit$fit)
BIC <-BIC(lr_fit_train$fit$fit$fit)
lr <- c("Logistic",k, AIC_, BIC, AICc, lr_deviance)
#STEP
# Fit statistics for the stepwise-selected logistic regression.
AIC_<-step_fit_train$fit$fit$fit$aic
k <-step_fit_train$fit$fit$fit$df.residual
AICc <- AICc(step_fit_train$fit$fit$fit)
BIC <-BIC(step_fit_train$fit$fit$fit)
s <- c("Stepwise",k, AIC_, BIC, AICc, step_deviance)


# Assemble the comparison table; the first row (h) supplies the column
# names and is removed after naming.
models <- data.frame(rbind(h,lr, s,r,l, e))

names(models) <- models[1,]
models <- models[-1,]
rownames(models) <- NULL
# mutate_at() is superseded; across() performs the same numeric
# conversions and rounding in one pass each.
models <- models %>%
  mutate(across(c(AIC, AICc, BIC, Deviance), as.numeric)) %>%
  mutate(across(where(is.numeric), ~ round(.x, digits = 2)))
models %>%
  # Fixed the stray ")" that was left in the caption text.
  kbl(caption = "Table x. Model fit statistics") %>%
  kable_classic(full_width = FALSE, html_font = "Cambria") %>%
  collapse_rows(columns = 1, valign = "top")
## Table x. )
## Model DF AIC BIC AICc Deviance
## Logistic 74 140.65 188.77 151.06 102.65
## Stepwise 88 123.89 136.55 124.58 113.89
## Ridge 18 57.13 87.97 88.22 21.13
## Lasso 12 50.51 71.07 61.65 26.51
## Elnet 12 50.32 70.88 61.46 26.32
save(models, file = "model_compare.RData")
# Distribution of the elastic net's predicted probabilities on training data.
hist(elnet_pred_prob_train$.pred_Female)

hist(elnet_pred_prob_train$.pred_Male)

# dev.ratio: fraction of (null) deviance explained by each glmnet fit.
summary(lasso_mod$dev.ratio)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.5286  0.5286  0.5286  0.5286  0.5286  0.5286
ridge_mod$dev.ratio
## [1] 0.6242101
elnet_mod$dev.ratio
## [1] 0.5319226
# Residual plots for the two glm fits.
plot(residuals(step_fit_train$fit$fit$fit))

plot(residuals(lr_fit_train$fit$fit$fit))

#plot(residuals(lasso_fit_train$fit$fit$fit$nobs))
# Coefficient-path plot for the lasso workflow's underlying glmnet fit.
plot(lasso_fit_train$fit$fit$fit)

# Coefficient paths for the elastic-net fit (plot_glmnet, presumably from
# the plotmo package — confirm).
plot_glmnet(elnet_fit_train$fit$fit$fit ,
xvar = c("rlambda", "lambda", "norm", "dev"),
label = 10, nresponse = NA, grid.col = NA, s = NA)

# Lasso re-fit on the training matrix at a fixed lambda, only for plotres().
moo <-glmnet(train_x, train_y, family = "binomial", alpha = 1, lambda =0.024658110758226)
plotres(moo,w1.col=1:9, w1.nresponse="Male", w1.xvar = c( "dev"))

# NOTE(review): this overwrites elnet_mod with an alpha = 1 (i.e. lasso)
# fit, and `type.measure` is a cv.glmnet() argument, not a glmnet()
# argument — verify both.
elnet_mod <- (glmnet::glmnet(test_x, test_y, family = "binomial", alpha = 1, lambda =0.0242012826479438, type.measure = "deviance"))


# Null deviance of the re-fit model.
elnet_mod$nulldev
## [1] 56.22679
library (sure)

# NOTE(review): elnet_pred_class_test is a prediction tibble with no
# residuals component, so resid() returns NULL here (with the
# "uninitialised column" warnings below) — this call yields no usable
# residuals.
resid(elnet_pred_class_test)
## Warning: Unknown or uninitialised column: `na.action`.
## Warning: Unknown or uninitialised column: `residuals`.
## NULL
elnet_fit_rs$.predictions
## [[1]]
## # A tibble: 11 × 6
##    .pred_Female .pred_Male  .row .pred_class Gender .config             
##           <dbl>      <dbl> <int> <fct>       <fct>  <chr>               
##  1        0.446      0.554    18 Male        Female Preprocessor1_Model1
##  2        0.410      0.590    21 Male        Female Preprocessor1_Model1
##  3        0.421      0.579    23 Male        Female Preprocessor1_Model1
##  4        0.510      0.490    35 Female      Female Preprocessor1_Model1
##  5        0.501      0.499    42 Female      Female Preprocessor1_Model1
##  6        0.421      0.579    52 Male        Male   Preprocessor1_Model1
##  7        0.414      0.586    57 Male        Male   Preprocessor1_Model1
##  8        0.427      0.573    62 Male        Male   Preprocessor1_Model1
##  9        0.427      0.573    78 Male        Male   Preprocessor1_Model1
## 10        0.412      0.588    85 Male        Male   Preprocessor1_Model1
## 11        0.426      0.574    92 Male        Male   Preprocessor1_Model1
## 
## [[2]]
## # A tibble: 10 × 6
##    .pred_Female .pred_Male  .row .pred_class Gender .config             
##           <dbl>      <dbl> <int> <fct>       <fct>  <chr>               
##  1        0.416      0.584     6 Male        Female Preprocessor1_Model1
##  2        0.493      0.507    17 Male        Female Preprocessor1_Model1
##  3        0.446      0.554    20 Male        Female Preprocessor1_Model1
##  4        0.535      0.465    31 Female      Female Preprocessor1_Model1
##  5        0.464      0.536    41 Male        Female Preprocessor1_Model1
##  6        0.527      0.473    71 Female      Male   Preprocessor1_Model1
##  7        0.415      0.585    72 Male        Male   Preprocessor1_Model1
##  8        0.414      0.586    77 Male        Male   Preprocessor1_Model1
##  9        0.416      0.584    83 Male        Male   Preprocessor1_Model1
## 10        0.415      0.585    91 Male        Male   Preprocessor1_Model1
## 
## [[3]]
## # A tibble: 9 × 6
##   .pred_Female .pred_Male  .row .pred_class Gender .config             
##          <dbl>      <dbl> <int> <fct>       <fct>  <chr>               
## 1        0.430      0.570     4 Male        Female Preprocessor1_Model1
## 2        0.434      0.566    12 Male        Female Preprocessor1_Model1
## 3        0.393      0.607    32 Male        Female Preprocessor1_Model1
## 4        0.399      0.601    38 Male        Female Preprocessor1_Model1
## 5        0.404      0.596    45 Male        Male   Preprocessor1_Model1
## 6        0.427      0.573    47 Male        Male   Preprocessor1_Model1
## 7        0.427      0.573    50 Male        Male   Preprocessor1_Model1
## 8        0.393      0.607    69 Male        Male   Preprocessor1_Model1
## 9        0.525      0.475    86 Female      Male   Preprocessor1_Model1
## 
## [[4]]
## # A tibble: 9 × 6
##   .pred_Female .pred_Male  .row .pred_class Gender .config             
##          <dbl>      <dbl> <int> <fct>       <fct>  <chr>               
## 1        0.419      0.581    15 Male        Female Preprocessor1_Model1
## 2        0.432      0.568    19 Male        Female Preprocessor1_Model1
## 3        0.442      0.558    33 Male        Female Preprocessor1_Model1
## 4        0.485      0.515    36 Male        Female Preprocessor1_Model1
## 5        0.430      0.570    54 Male        Male   Preprocessor1_Model1
## 6        0.519      0.481    73 Female      Male   Preprocessor1_Model1
## 7        0.411      0.589    74 Male        Male   Preprocessor1_Model1
## 8        0.409      0.591    76 Male        Male   Preprocessor1_Model1
## 9        0.488      0.512    79 Male        Male   Preprocessor1_Model1
## 
## [[5]]
## # A tibble: 9 × 6
##   .pred_Female .pred_Male  .row .pred_class Gender .config             
##          <dbl>      <dbl> <int> <fct>       <fct>  <chr>               
## 1        0.438      0.562    26 Male        Female Preprocessor1_Model1
## 2        0.490      0.510    27 Male        Female Preprocessor1_Model1
## 3        0.455      0.545    29 Male        Female Preprocessor1_Model1
## 4        0.418      0.582    39 Male        Female Preprocessor1_Model1
## 5        0.460      0.540    43 Male        Male   Preprocessor1_Model1
## 6        0.436      0.564    51 Male        Male   Preprocessor1_Model1
## 7        0.423      0.577    56 Male        Male   Preprocessor1_Model1
## 8        0.431      0.569    68 Male        Male   Preprocessor1_Model1
## 9        0.535      0.465    88 Female      Male   Preprocessor1_Model1
## 
## [[6]]
## # A tibble: 9 × 6
##   .pred_Female .pred_Male  .row .pred_class Gender .config             
##          <dbl>      <dbl> <int> <fct>       <fct>  <chr>               
## 1        0.429      0.571     8 Male        Female Preprocessor1_Model1
## 2        0.551      0.449    10 Female      Female Preprocessor1_Model1
## 3        0.450      0.550    28 Male        Female Preprocessor1_Model1
## 4        0.432      0.568    30 Male        Female Preprocessor1_Model1
## 5        0.424      0.576    44 Male        Male   Preprocessor1_Model1
## 6        0.432      0.568    48 Male        Male   Preprocessor1_Model1
## 7        0.422      0.578    53 Male        Male   Preprocessor1_Model1
## 8        0.422      0.578    59 Male        Male   Preprocessor1_Model1
## 9        0.433      0.567    80 Male        Male   Preprocessor1_Model1
## 
## [[7]]
## # A tibble: 9 × 6
##   .pred_Female .pred_Male  .row .pred_class Gender .config             
##          <dbl>      <dbl> <int> <fct>       <fct>  <chr>               
## 1        0.531      0.469     1 Female      Female Preprocessor1_Model1
## 2        0.389      0.611     2 Male        Female Preprocessor1_Model1
## 3        0.436      0.564    11 Male        Female Preprocessor1_Model1
## 4        0.446      0.554    24 Male        Female Preprocessor1_Model1
## 5        0.444      0.556    49 Male        Male   Preprocessor1_Model1
## 6        0.445      0.555    65 Male        Male   Preprocessor1_Model1
## 7        0.431      0.569    66 Male        Male   Preprocessor1_Model1
## 8        0.416      0.584    89 Male        Male   Preprocessor1_Model1
## 9        0.417      0.583    93 Male        Male   Preprocessor1_Model1
## 
## [[8]]
## # A tibble: 9 × 6
##   .pred_Female .pred_Male  .row .pred_class Gender .config             
##          <dbl>      <dbl> <int> <fct>       <fct>  <chr>               
## 1        0.416      0.584     3 Male        Female Preprocessor1_Model1
## 2        0.483      0.517     5 Male        Female Preprocessor1_Model1
## 3        0.526      0.474    14 Female      Female Preprocessor1_Model1
## 4        0.403      0.597    40 Male        Female Preprocessor1_Model1
## 5        0.483      0.517    55 Male        Male   Preprocessor1_Model1
## 6        0.424      0.576    58 Male        Male   Preprocessor1_Model1
## 7        0.438      0.562    63 Male        Male   Preprocessor1_Model1
## 8        0.487      0.513    64 Male        Male   Preprocessor1_Model1
## 9        0.605      0.395    90 Female      Male   Preprocessor1_Model1
## 
## [[9]]
## # A tibble: 9 × 6
##   .pred_Female .pred_Male  .row .pred_class Gender .config             
##          <dbl>      <dbl> <int> <fct>       <fct>  <chr>               
## 1        0.463      0.537     7 Male        Female Preprocessor1_Model1
## 2        0.441      0.559    13 Male        Female Preprocessor1_Model1
## 3        0.487      0.513    22 Male        Female Preprocessor1_Model1
## 4        0.563      0.437    25 Female      Female Preprocessor1_Model1
## 5        0.495      0.505    61 Male        Male   Preprocessor1_Model1
## 6        0.446      0.554    67 Male        Male   Preprocessor1_Model1
## 7        0.440      0.560    70 Male        Male   Preprocessor1_Model1
## 8        0.437      0.563    75 Male        Male   Preprocessor1_Model1
## 9        0.430      0.570    84 Male        Male   Preprocessor1_Model1
## 
## [[10]]
## # A tibble: 9 × 6
##   .pred_Female .pred_Male  .row .pred_class Gender .config             
##          <dbl>      <dbl> <int> <fct>       <fct>  <chr>               
## 1        0.425      0.575     9 Male        Female Preprocessor1_Model1
## 2        0.456      0.544    16 Male        Female Preprocessor1_Model1
## 3        0.398      0.602    34 Male        Female Preprocessor1_Model1
## 4        0.480      0.520    37 Male        Female Preprocessor1_Model1
## 5        0.432      0.568    46 Male        Male   Preprocessor1_Model1
## 6        0.446      0.554    60 Male        Male   Preprocessor1_Model1
## 7        0.508      0.492    81 Female      Male   Preprocessor1_Model1
## 8        0.611      0.389    82 Female      Male   Preprocessor1_Model1
## 9        0.442      0.558    87 Male        Male   Preprocessor1_Model1
# Tidied coefficient tables for each model.
t1 <- lr_fit_train$fit$fit$fit %>% tidy()
t2 <- step_fit_train$fit$fit$fit %>% tidy()
t3 <- ridge_mod %>% tidy() %>% dplyr::select(term, estimate)
t4 <- lasso_mod %>% tidy() %>% dplyr::select(term, estimate)
t5 <- elnet_mod %>% tidy() %>% dplyr::select(term, estimate)
# Metrics table with rows 2-6 dropped, keeping roc_auc, specificity and
# sensitivity.
t6 <- results_table_test
t6 <- t6[-(2:6), ]
rownames(t6) <- NULL
save(t1, file = "table1.RData")
save(t2, file = "table2.RData")
# NOTE(review): tables 3-5 save ridge_coeff/lasso_coeff/elnet_coeff
# (defined elsewhere) rather than the t3/t4/t5 objects computed above —
# confirm which set is intended.
save(ridge_coeff, file = "table3.RData")
save(lasso_coeff, file = "table4.RData")
save(elnet_coeff, file = "table5.RData")
save(t6, file = "table6.RData")
# Cross-validated lambda curve for the lasso. Fixed the misspelled
# `nlamda` argument — glmnet expects `nlambda`, so the misspelled version
# was silently not applied.
lambda_plot <- glmnet::cv.glmnet(train_x, train_y, family = "binomial",
                                 alpha = 1, type.measure = "auc",
                                 nfolds = 9, nlambda = 100000)
autoplot(lambda_plot, colour = 'red')

plot(lambda_plot)

######################################################
# Multicollinearity test
######################################################

# Variance inflation factors for the full logistic regression.
# tidy() on a bare numeric vector is deprecated (per the previously
# emitted warning); tibble::enframe() produces the same two-column
# (Variable, VIF) tibble directly from the named vector.
vif_table <- vif(lr_fit_train$fit$fit$fit) %>%
  tibble::enframe(name = "Variable", value = "VIF")
save(vif_table, file = "table_appendix1.RData")

# VIFs for the stepwise-selected model (printed, not saved).
vif(step_fit_train$fit$fit$fit) %>%
  tibble::enframe(name = "Variable", value = "VIF")
######################################################
# Linearity test
######################################################
# Component+residual plots to check linearity of the logit in the two
# continuous predictors, each in a univariable model.
crPlots(glm(Gender ~ (Age), 
  data=final_data, family=binomial), smooth=list(span=1))

crPlots(glm(Gender ~ BMI, 
  data=final_data, family=binomial), smooth=list(span=1))

# Confusion matrix on the test set. NOTE(review): elnet_mod was last
# re-fit above with alpha = 1 (a lasso) — confirm this is the model the
# matrix is meant to describe.
cnf_test <- confusion.glmnet(elnet_mod, newx = test_x, newy = test_y)
cnf_test
##          True
## Predicted Female Male Total
##    Female     16    3    19
##    Male        2   20    22
##    Total      18   23    41
## 
##  Percent Correct:  0.878
# Plot the test-set confusion matrix; tidy() converts the contingency
# table to the long (True, Predicted, n) format plot_confusion_matrix()
# expects.
cnf_test %>% tidy() %>%
plot_confusion_matrix( 
                      target_col = "True", 
                      prediction_col = "Predicted",
                      counts_col = "n")
## Warning: 'tidy.table' is deprecated.
## Use 'tibble::as_tibble()' instead.
## See help("Deprecated")
## Warning in plot_confusion_matrix(., target_col = "True", prediction_col =
## "Predicted", : 'ggimage' is missing. Will not plot arrows and zero-shading.
## Warning in plot_confusion_matrix(., target_col = "True", prediction_col =
## "Predicted", : 'rsvg' is missing. Will not plot arrows and zero-shading.

# Confusion matrices for the logistic and stepwise models, read from
# pre-computed spreadsheets.
cnfmat_lr <- read_excel(here("data","conf_mat_lr.xlsx"))
cnfmat_step <- read_excel(here("data","conf_mat_lstep.xlsx"))

# Print the logistic regression's test predictions.
lr_results_test
# NOTE(review): the two confusion-matrix computations below are printed/
# assigned but not used afterwards in the visible code (`cmat` is never
# read) — confirm they are still needed.
confusion_matrix(lr_results_test$Gender, lr_results_test$.pred_class)
cmat <- conf_mat(lr_results_test, truth = "Gender", estimate = ".pred_class")
cnf_test %>% tidy()
## Warning: 'tidy.table' is deprecated.
## Use 'tibble::as_tibble()' instead.
## See help("Deprecated")
# Plot the logistic model's confusion matrix from the spreadsheet.
cnfmat_lr %>% plot_confusion_matrix( 
                      target_col = "True", 
                      prediction_col = "Predicted",
                      counts_col = "n")
## Warning in plot_confusion_matrix(., target_col = "True", prediction_col =
## "Predicted", : 'ggimage' is missing. Will not plot arrows and zero-shading.
## Warning in plot_confusion_matrix(., target_col = "True", prediction_col =
## "Predicted", : 'rsvg' is missing. Will not plot arrows and zero-shading.

# Plot the stepwise model's confusion matrix from the spreadsheet.
cnfmat_step %>% plot_confusion_matrix( 
                      target_col = "True", 
                      prediction_col = "Predicted",
                      counts_col = "n")
## Warning in plot_confusion_matrix(., target_col = "True", prediction_col =
## "Predicted", : 'ggimage' is missing. Will not plot arrows and zero-shading.

## Warning in plot_confusion_matrix(., target_col = "True", prediction_col =
## "Predicted", : 'rsvg' is missing. Will not plot arrows and zero-shading.

autoplot(glm(formula = Gender~ ., family = "binomial", data = data_train), which = 1:6, label.size = 3)
## Warning: not plotting observations with leverage one:
##   18, 19
## Warning: Removed 2 rows containing missing values (`geom_line()`).
## Warning: Removed 2 rows containing missing values (`geom_segment()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Warning: Removed 2 rows containing missing values (`geom_line()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Warning: Removed 2 rows containing missing values (`geom_line()`).

gvif_ <- gvif(glm(formula = Gender~ ., family = "binomial", data = data_train))
##                            GVIF df GVIF^(1/(2*df))
## Age                      1.4694  1          1.2122
## BMI                      1.3675  1          1.1694
## CM_DM                    1.4288  1          1.1953
## CM_HTM                   1.5279  1          1.2361
## CM_IHD                   1.4757  1          1.2148
## Lung_Disease             1.1400  1          1.0677
## Symptom_Fever            1.3112  1          1.1451
## Symptom_Cough            1.4049  1          1.1853
## Symptom_Sputum           1.3968  1          1.1819
## Symptom_Runny_Nose       1.1762  1          1.0845
## Symptom_Breathlessness   1.4578  1          1.2074
## Symptom_Abdominal_Pain   1.5726  1          1.2540
## Symptom_Nausea_Vomiting  1.5609  1          1.2494
## Symptom_Diarrhoea        1.2240  1          1.1063
## Symptom_Anosmia_Hyposmia 1.2525  1          1.1192
## Symptom_Red_Eye          1.0000  1          1.0000
## Symptom_Skin_Rash        1.0000  1          1.0000
## Outcome                  1.2259  1          1.1072
# Save the GVIF table for the appendix.
t10 <- (gvif_)
save(t10, file = "gvif.RData")
# car::vif on the same model, for comparison with gvif() above.
car::vif(glm(formula = Gender~ ., family = "binomial", data = data_train))
##                      Age                      BMI                    CM_DM 
##                 1.469415                 1.367541                 1.428762 
##                   CM_HTM                   CM_IHD             Lung_Disease 
##                 1.527945                 1.475719                 1.139998 
##            Symptom_Fever            Symptom_Cough           Symptom_Sputum 
##                 1.311183                 1.404891                 1.396843 
##       Symptom_Runny_Nose   Symptom_Breathlessness   Symptom_Abdominal_Pain 
##                 1.176179                 1.457801                 1.572631 
##  Symptom_Nausea_Vomiting        Symptom_Diarrhoea Symptom_Anosmia_Hyposmia 
##                 1.560862                 1.224015                 1.252535 
##          Symptom_Red_Eye        Symptom_Skin_Rash                  Outcome 
##                 1.000000                 1.000000                 1.225894